Exploratory analysis of variables

# Load the cancer registry data and prepare it for exploration:
#   - convert the column names to snake_case,
#   - split `geography` into separate `county` and `state` factor columns,
#   - derive `pct_case_count`, the average annual case count scaled to cases
#     per 100,000 residents (despite the `pct_` prefix it is not a percentage),
#   - move the outcome and the derived rate to the front of the data frame.
cancer_df <- read_csv("./data/Cancer_Registry.csv") %>%
  janitor::clean_names() %>%
  # Split on the comma and swallow any whitespace after it, so the `state`
  # levels cannot start with a stray blank.
  separate(geography, c("county", "state"), sep = ",\\s*") %>%
  mutate(
    county = as.factor(county),
    state = as.factor(state),
    pct_case_count = avg_ann_count / pop_est2015 * 100000
  ) %>%
  # One reordering step is enough: outcome first, derived rate second,
  # remaining columns in their existing order.
  select(target_death_rate, pct_case_count, everything())
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   avgDeathsPerYear = col_integer(),
##   medIncome = col_integer(),
##   popEst2015 = col_integer(),
##   binnedInc = col_character(),
##   Geography = col_character()
## )
## See spec(...) for full column specifications.

Annual diagnosed case count per 100,000 population plot

# Scatter plot of the annual case rate (`pct_case_count`) against the
# death rate, one point per row of `cancer_df`, coloured by state.
plot_count_pct <- ggplot(
  data = cancer_df,
  mapping = aes(y = pct_case_count, x = target_death_rate, color = state)
) +
  geom_point()
# Smoothing layer left disabled: + geom_smooth(se = FALSE)

# Render the static plot as an interactive plotly widget.
ggplotly(plot_count_pct)

Incidence rate plot

# Scatter plot of death rate against incidence rate, coloured by state.
plot_incidence <- ggplot(
  data = cancer_df,
  mapping = aes(x = incidence_rate, y = target_death_rate, color = state)
) +
  geom_point()
# Smoothing layer left disabled: + geom_smooth(se = FALSE)

# Separate statement (not part of the ggplot chain): show it interactively.
ggplotly(plot_incidence)
# Author's note: the influential points in the dataset come from the states
# Florida and Virginia.

Income plot

# Scatter plot of death rate against median income, coloured by state.
plot_income <- ggplot(
  data = cancer_df,
  mapping = aes(x = med_income, y = target_death_rate, color = state)
) +
  geom_point()
# Smoothing layer left disabled: + geom_smooth(se = FALSE)

# Separate statement (not part of the ggplot chain): show it interactively.
ggplotly(plot_income)

Age plots

# Scatter plot of death rate against median age with a smoothed trend line
# (confidence band switched off), rendered as an interactive plotly widget.
plot_age_1 <- cancer_df %>%
  ggplot(aes(x = median_age, y = target_death_rate)) +
  geom_point() +
  # Spell out FALSE: `F` is an ordinary variable that can be reassigned.
  geom_smooth(se = FALSE)
ggplotly(plot_age_1)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
# `median_age` contains erroneous values larger than 100; keep only the
# rows below 100 and look at the distribution of the remaining values.
ggplot(
  data = filter(cancer_df, median_age < 100),
  mapping = aes(x = median_age)
) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Scatter plot of death rate against male median age with a smoothed trend
# line (confidence band switched off), rendered as an interactive widget.
plot_age_2 <- cancer_df %>%
  ggplot(aes(x = median_age_male, y = target_death_rate)) +
  geom_point() +
  # Spell out FALSE: `F` is an ordinary variable that can be reassigned.
  geom_smooth(se = FALSE)
ggplotly(plot_age_2)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
# Scatter plot of death rate against female median age with a smoothed trend
# line (confidence band switched off), rendered as an interactive widget.
plot_age_3 <- cancer_df %>%
  ggplot(aes(x = median_age_female, y = target_death_rate)) +
  geom_point() +
  # Spell out FALSE: `F` is an ordinary variable that can be reassigned.
  geom_smooth(se = FALSE)
ggplotly(plot_age_3)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
# Pairwise correlation matrix of the numeric columns, shown as a tibble.
# `county`, `state` (factors) and `binned_inc` (character) are dropped first
# because cor() only accepts numeric input.
cancer_df %>%
  select(-county, -state, -binned_inc) %>%
  cor() %>%
  # as.tibble() is deprecated in tibble; as_tibble() is the supported name.
  as_tibble()
## # A tibble: 33 x 33
##    target_death_ra… pct_case_count avg_ann_count avg_deaths_per_…
##               <dbl>          <dbl>         <dbl>            <dbl>
##  1          1             -0.0578        -0.144           -0.0907
##  2         -0.0578         1              0.161           -0.0589
##  3         -0.144          0.161          1                0.939 
##  4         -0.0907        -0.0589         0.939            1     
##  5          0.449          0.0225         0.0736           0.0627
##  6         -0.429          0.0291         0.269            0.223 
##  7         -0.120         -0.0518         0.927            0.978 
##  8          0.429         -0.123         -0.136           -0.0669
##  9         -0.0223        -0.00481        0.0821           0.0635
## 10          0.00438        0.0375        -0.0241          -0.0246
## # ... with 23 more rows, and 29 more variables: incidence_rate <dbl>,
## #   med_income <dbl>, pop_est2015 <dbl>, poverty_percent <dbl>,
## #   study_per_cap <dbl>, median_age <dbl>, median_age_male <dbl>,
## #   median_age_female <dbl>, avg_household_size <dbl>,
## #   percent_married <dbl>, pct_no_hs18_24 <dbl>, pct_hs18_24 <dbl>,
## #   pct_some_col18_24 <dbl>, pct_bach_deg18_24 <dbl>, pct_hs25_over <dbl>,
## #   pct_bach_deg25_over <dbl>, pct_employed16_over <dbl>,
## #   pct_unemployed16_over <dbl>, pct_private_coverage <dbl>,
## #   pct_private_coverage_alone <dbl>, pct_emp_priv_coverage <dbl>,
## #   pct_public_coverage <dbl>, pct_public_coverage_alone <dbl>,
## #   pct_white <dbl>, pct_black <dbl>, pct_asian <dbl>,
## #   pct_other_race <dbl>, pct_married_households <dbl>, birth_rate <dbl>
# Fit a linear model of death rate on annual case count, population and
# their interaction, then print the model summary.
summary(
  lm(
    formula = target_death_rate ~ avg_ann_count * pop_est2015,
    data = cancer_df
  )
)
## 
## Call:
## lm(formula = target_death_rate ~ avg_ann_count * pop_est2015, 
##     data = cancer_df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -121.215  -17.194   -0.065   15.969  182.511 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                1.812e+02  5.692e-01 318.377  < 2e-16 ***
## avg_ann_count             -4.219e-03  9.348e-04  -4.513 6.63e-06 ***
## pop_est2015               -2.315e-06  4.606e-06  -0.503    0.615    
## avg_ann_count:pop_est2015  4.743e-10  1.061e-10   4.468 8.17e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 27.37 on 3043 degrees of freedom
## Multiple R-squared:  0.02817,    Adjusted R-squared:  0.02721 
## F-statistic:  29.4 on 3 and 3043 DF,  p-value: < 2.2e-16